import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
ben = pd.read_csv('data/Train_Beneficiarydata-1542865627584.csv')
inp = pd.read_csv('data/Train_Inpatientdata-1542865627584.csv')
out = pd.read_csv('data/Train_Outpatientdata-1542865627584.csv')
label = pd.read_csv('data/Train-1542865627584.csv')
print(ben.shape)
ben.sample(2)
print(inp.shape)
inp.sample(2)
print(out.shape)
out.sample(2)
print('columns in inpatient but not in outpatient: ', [i for i in inp.columns if i not in out.columns])
print('columns in outpatient but not in inpatient: ', [i for i in out.columns if i not in inp.columns])
inp['ClmType'] = 'inpatient'
out['ClmType'] = 'outpatient'
clm = pd.concat([inp, out], sort=False)
print(clm.shape)
clm.sample(2)
df = pd.merge(clm, ben, how='left', on='BeneID')
# df = pd.merge(df, label, how='left', on='Provider')
df = df.sort_values('ClaimStartDt').reset_index(drop=True) # sort df by claim start date
print(df.shape)
df.head()
# understand the quantity of missing values
missing_values_count = df.isnull().sum().sort_values(ascending=False)
missing_values_per = round((missing_values_count/len(df)) * 100,2)
miss = pd.concat([missing_values_count, missing_values_per], axis=1, keys = ['Quant Missing values', 'Percentage Missing values - %']).reset_index()
miss[miss['Percentage Missing values - %']!=0]
Note: only the inpatient data has the columns AdmissionDt, DischargeDt and DiagnosisGroupCode, so the high missing percentages above come from outpatient claims. Restricted to inpatient claims, these three columns actually have no missing values.
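As a quick check (a small sketch using the ClmType flag created above), restricting to inpatient claims confirms that these three columns have no missing values there:
# restrict to inpatient claims and count missing values in the inpatient-only columns
inpatient_only_cols = ['AdmissionDt', 'DischargeDt', 'DiagnosisGroupCode']
print(df.loc[df.ClmType == 'inpatient', inpatient_only_cols].isnull().sum())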
# # drop columns with missing value over 50%
# drop_list = [i for i in miss[miss['Percentage Missing values - %']>50]['index'] if i not in ['AdmissionDt', 'DischargeDt', 'DiagnosisGroupCode']]
# df.drop(drop_list, axis = 1, inplace = True)
# distinct number of provider
df.Provider.nunique()
After merging and cleaning the data, we have 558,211 claim records for 5,410 distinct providers. There are 38 columns in total, including the target variable "PotentialFraud".
demo_list = list(ben.columns)
demo_list.append('Provider')
demo = df[demo_list]
demo.info()
demo.DOB = pd.to_datetime(demo.DOB, format = '%Y-%m-%d')
demo.DOD = pd.to_datetime(demo.DOD, format = '%Y-%m-%d')
demo['isDead']= 0
demo.loc[demo.DOD.notna(),'isDead'] = 1
demo= demo.replace({'ChronicCond_Alzheimer': 2, 'ChronicCond_Heartfailure': 2, 'ChronicCond_KidneyDisease': 2,
'ChronicCond_Cancer': 2, 'ChronicCond_ObstrPulmonary': 2, 'ChronicCond_Depression': 2,
'ChronicCond_Diabetes': 2, 'ChronicCond_IschemicHeart': 2, 'ChronicCond_Osteoporasis': 2,
'ChronicCond_rheumatoidarthritis': 2, 'ChronicCond_stroke': 2, 'Gender': 2 }, 0)
demo = demo.replace({'RenalDiseaseIndicator': 'Y'}, 1).astype({'RenalDiseaseIndicator': 'int64'})
df1 = demo.groupby(['Provider'], as_index = False)[['Gender', 'RenalDiseaseIndicator',
'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
'ChronicCond_stroke', 'isDead']].sum()
CountBene = demo[['BeneID']].groupby(demo['Provider']).count().reset_index().rename(columns={'BeneID':'CountBene'})
df1 = df1.merge(CountBene, on='Provider', how='left')
df1.loc[:, df1.columns != 'Provider'] = df1.iloc[:,1:].div(df1.CountBene, axis=0)
df1.drop('CountBene', axis = 1, inplace = True)
df2 = demo[['BeneID']].groupby(demo['Provider']).nunique().reset_index().rename(columns={'BeneID':'DistinctBene'})
df3 = demo.groupby(['Provider'], as_index = False)[['NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt']].mean()
demo_agg = df2.merge(df1, on='Provider', how='left').merge(df3, on='Provider', how='left').merge(label, on='Provider', how='left')
demo_agg = demo_agg.replace({'PotentialFraud': 'No'}, 0)
demo_agg = demo_agg.replace({'PotentialFraud': 'Yes'}, 1)
demo_agg.sample(2)
demo_agg.describe()
# demo_agg.to_csv('demo_agg.csv')
def heatMap(df):
#Create Correlation df
corr = df.corr()
#Plot figsize
fig, ax = plt.subplots(figsize=(20, 20))
#Generate Color Map
colormap = sns.diverging_palette(220, 10, as_cmap=True)
#Generate Heat Map, allow annotations and place floats in map
sns.heatmap(corr, cmap=colormap, annot=True, fmt=".2f")
#Apply xticks
plt.xticks(range(len(corr.columns)), corr.columns);
#Apply yticks
plt.yticks(range(len(corr.columns)), corr.columns)
#show plot
plt.show()
heatMap(demo_agg)
# plot continuous feature
def plot_con(df, col_name, full_name, ratio=True):
# Plot without PotentialFraud
if ratio==False:
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,4), dpi=90)
sns.distplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
ax1.set_xlabel(full_name)
ax1.set_ylabel('Count')
ax1.set_title(full_name)
else:
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10,4), dpi=90)
sns.boxplot(y=col_name, data=df, ax=ax1)
ax1.set_xlabel(full_name)
ax1.set_ylabel('Ratio')
ax1.set_title(full_name)
# Plot with PotentialFraud
sns.boxplot(x='PotentialFraud', y=col_name, data=df, ax=ax2)
ax2.set_ylabel(full_name)
ax2.set_title(full_name + ' by PotentialFraud or Not')
ax2.set_xlabel('PotentialFraud or Not')
plt.tight_layout()
demo_agg.columns
ChronicCond_list = ['ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
'ChronicCond_stroke']
ChronicCond = demo_agg[ChronicCond_list+['PotentialFraud']]
ChronicCond = pd.melt(ChronicCond, id_vars=['PotentialFraud'], value_vars=ChronicCond_list)
ChronicCond = ChronicCond.replace({'ChronicCond_':''}, regex=True)
ax = sns.catplot(data=ChronicCond, x='variable', y='value', hue='PotentialFraud', kind='box', height = 4, aspect = 4)
ax.fig.subplots_adjust(top=0.9)
ax.fig.suptitle('Chronic Condition By Potential Fraud', fontsize=18)
# DistinctBene = demo_agg[demo_agg['DistinctBene']<=demo_agg.DistinctBene.quantile(0.99)]
plot_con(demo_agg, 'DistinctBene', 'Distinct Number of Beneficiary', ratio=False)
plot_con(demo_agg, 'Gender', 'Ratio of Male')
plot_con(demo_agg, 'RenalDiseaseIndicator', 'Renal Disease')
plot_con(demo_agg, 'IPAnnualReimbursementAmt', 'IPAnnualReimbursementAmt', ratio=False)
plot_con(demo_agg, 'IPAnnualDeductibleAmt', 'IPAnnualDeductibleAmt', ratio=False)
plot_con(demo_agg, 'OPAnnualReimbursementAmt', 'OPAnnualReimbursementAmt', ratio=False)
plot_con(demo_agg, 'OPAnnualDeductibleAmt', 'OPAnnualDeductibleAmt', ratio=False)
# load the package
from datetime import date
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['figure.figsize'] = (8, 6)
sns.set()
import warnings
warnings.filterwarnings('ignore')
# load the data
patient = pd.read_csv('Train_Beneficiarydata-1542865627584.csv')
inpatient = pd.read_csv('Train_Inpatientdata-1542865627584.csv')
outpatient = pd.read_csv('Train_Outpatientdata-1542865627584.csv')
fraud = pd.read_csv('Train-1542865627584.csv', index_col = 'Provider')
# map 'No' to 0 and 'Yes' to 1
fraud_map = {'No': 0, 'Yes': 1}
fraud.PotentialFraud = fraud.PotentialFraud.map(fraud_map)
# plot potential fraud distribution
sns.countplot(x = 'PotentialFraud', data = fraud)
plt.title('Potential Provider Fraud Distribution')
plt.show()
inpatient.drop(['DiagnosisGroupCode', 'ClmAdmitDiagnosisCode', 'ClmDiagnosisCode_1',
'ClmDiagnosisCode_10', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6'], axis = 1, inplace = True)
outpatient.drop(['ClmAdmitDiagnosisCode', 'ClmDiagnosisCode_1',
'ClmDiagnosisCode_10', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6'], axis = 1, inplace = True)
inpatient.shape
outpatient.shape
# check missing values
def missing_val(df):
missing = df.isnull().sum()
missing_percentage = (df.isnull().sum()/len(df)*100).round(2)
missing_val = pd.concat([missing, missing_percentage], axis = 1)
missing_val.columns = ['Missing Values', '% Missing']
total_columns = df.shape[1]
missing_columns = (df.isnull().sum() > 0).sum()
print('Out of {} columns, {} columns have missing values'.format(total_columns, missing_columns))
return missing_val
missing_val(inpatient)
missing_val(outpatient)
# merge data
inpatient['In'] = 1
outpatient['Out'] = 1
patientDOB = patient[['BeneID', 'DOB']]
claims = inpatient.append(outpatient)
claims = claims.merge(patientDOB, how = 'left', on = 'BeneID')
claims.shape
claims.info()
claims.head()
# convert all date variables to datetime datatype
claims.DOB = pd.to_datetime(claims.DOB)
claims.AdmissionDt = pd.to_datetime(claims.AdmissionDt)
claims.DischargeDt = pd.to_datetime(claims.DischargeDt)
claims.ClaimStartDt = pd.to_datetime(claims.ClaimStartDt)
claims.ClaimEndDt = pd.to_datetime(claims.ClaimEndDt)
# derive patient age
claims['Age'] = round(((claims.ClaimStartDt - claims.DOB).dt.days)/365, 0)
claims.describe(include = 'all')
# Only keep 2009 data
claims = claims.loc[claims.ClaimStartDt >= '2009-01-01']
claims.describe(include = 'all')
Age
sns.distplot(claims.Age)
plt.title('Distribution of Patients Age')
plt.show()
# average age of patients
AvgAge = claims.groupby('Provider').agg({'Age': 'mean'})
AvgAge.columns = ['AvgAge']
AvgAge.head()
AdmissionDt, DischargeDt
# inpatient days
claims['InpatientDays'] = (claims.DischargeDt - claims.AdmissionDt).dt.days
sns.distplot(claims.InpatientDays.dropna())
plt.title('Distribution of the Number of Inpatient Days')
plt.show()
def groupby_provider(col, func, freq = 'M'):
temp = claims[['ClaimStartDt', 'Provider', col]]
temp.set_index('ClaimStartDt', inplace = True)
group = pd.DataFrame(temp.groupby([pd.Grouper(freq=freq), 'Provider']).agg(func)) # use the freq parameter
group.reset_index(inplace = True)
group = group.pivot(index = 'Provider', columns = 'ClaimStartDt', values = col)
group.columns = group.columns.strftime('%Y-%m-%d')
return group
# average number of inpatient days per month
InpatientDays = groupby_provider('InpatientDays', 'mean')
InpatientDays = InpatientDays.add_prefix('InDays')
InpatientDays.fillna(0, inplace = True)
InpatientDays.head()
# standard deviation
InpatientDays_Std = pd.DataFrame(InpatientDays.std(axis = 1), columns = ['InDaysStd'])
InpatientDays_Std.head()
# mean
InpatientDays_Mean = pd.DataFrame(InpatientDays.mean(axis = 1), columns = ['InDaysMean'])
InpatientDays_Mean.head()
# total inpatient days
InpatientDays_Sum = claims.groupby('Provider').agg({'InpatientDays': sum})
InpatientDays_Sum.columns = ['TotalInDays']
InpatientDays_Sum.head()
AttendingPhysician
AttendingPhysician = pd.DataFrame(claims.groupby('Provider')['AttendingPhysician'].nunique())
AttendingPhysician.head()
BeneID
# number of unique patients per month
UniqueBeneID = groupby_provider('BeneID', pd.Series.nunique)
UniqueBeneID = UniqueBeneID.add_prefix('PatientUNo')
UniqueBeneID.fillna(0, inplace = True)
UniqueBeneID.head()
# standard deviation
unibene_Std = pd.DataFrame(UniqueBeneID.std(axis = 1), columns = ['UniBeneStd'])
unibene_Std.head()
# mean
unibene_Mean = pd.DataFrame(UniqueBeneID.mean(axis = 1), columns = ['UniBeneMean'])
unibene_Mean.head()
# total patients
total_patients = claims.groupby('Provider').agg({'BeneID': 'nunique'})
total_patients.columns = ['TotalPatients']
total_patients.head()
ClaimStartDt, ClaimEndDt
# create variable: claim duration
claims['ClaimDays'] = (claims.ClaimEndDt - claims.ClaimStartDt).dt.days
sns.distplot(claims.ClaimDays)
plt.title('Distribution of the Number of Days between Claim Start and Claim End')
plt.show()
# Inpatient and Outpatient
figure, axes = plt.subplots(1, 2, figsize=(14, 5), sharex = True)
sns.distplot(claims.loc[claims.In == 1, 'ClaimDays'], ax = axes[0])
axes[0].set_title('Inpatient')
sns.distplot(claims.loc[claims.Out == 1, 'ClaimDays'], ax = axes[1])
axes[1].set_title('Outpatient')
plt.suptitle('Distribution of the Number of Days between Claim Start and Claim End', fontsize = 15)
plt.show()
# monthly average claim days
ClaimDays = groupby_provider('ClaimDays', 'mean')
ClaimDays = ClaimDays.add_prefix('ClaimDays')
ClaimDays.fillna(0, inplace = True)
ClaimDays.head()
# standard deviation
ClaimDays_Std = pd.DataFrame(ClaimDays.std(axis = 1), columns = ['ClaimDaysStd'])
ClaimDays_Std.head()
# mean
ClaimDays_Mean = pd.DataFrame(ClaimDays.mean(axis = 1), columns = ['ClaimDaysMean'])
ClaimDays_Mean.head()
ClaimID
# number of claims per month
ClaimID = groupby_provider('ClaimID', 'count')
ClaimID = ClaimID.add_prefix('ClaimNo')
ClaimID.fillna(0, inplace = True)
ClaimID.head()
# standard deviation
ClaimNum_Std = pd.DataFrame(ClaimID.std(axis = 1), columns = ['ClaimNumStd'])
ClaimNum_Std.head()
# mean
ClaimNum_Mean = pd.DataFrame(ClaimID.mean(axis = 1), columns = ['ClaimNumMean'])
ClaimNum_Mean.head()
# total claims
total_claim = claims.groupby('Provider').agg({'ClaimID': 'count'})
total_claim.columns = ['TotalClaim']
total_claim.head()
DeductibleAmtPaid
# average deductible amount paid per month
DeductibleAmtPaid = groupby_provider('DeductibleAmtPaid', 'mean')
DeductibleAmtPaid = DeductibleAmtPaid.add_prefix('Deductible')
DeductibleAmtPaid.fillna(0, inplace = True)
DeductibleAmtPaid.head()
# standard deviation
Deductible_Std = pd.DataFrame(DeductibleAmtPaid.std(axis = 1), columns = ['DeductibleStd'])
Deductible_Std.head()
# mean
Deductible_Mean = pd.DataFrame(DeductibleAmtPaid.mean(axis = 1), columns = ['DeductibleMean'])
Deductible_Mean.head()
# total deductible
total_deductible = claims.groupby('Provider').agg({'DeductibleAmtPaid': sum})
total_deductible.columns = ['TotalDeductible']
total_deductible.head()
In
# number of inpatient patients per month
In = groupby_provider('In', 'count')
In = In.add_prefix('In')
In.fillna(0, inplace = True)
In.head()
# standard deviation
In_Std = pd.DataFrame(In.std(axis = 1), columns = ['InStd'])
In_Std.head()
# mean
In_Mean = pd.DataFrame(In.mean(axis = 1), columns = ['InMean'])
In_Mean.head()
# total inpatients
total_in = claims.groupby('Provider').agg({'In': sum})
total_in.columns = ['TotalInpatients']
total_in.head()
InscClaimAmtReimbursed
# average reimbursement amount paid per month
InscClaimAmtReimbursed = groupby_provider('InscClaimAmtReimbursed', 'mean')
InscClaimAmtReimbursed = InscClaimAmtReimbursed.add_prefix('Reimbursed')
InscClaimAmtReimbursed.fillna(0, inplace = True)
InscClaimAmtReimbursed.head()
# standard deviation
Insc_Std = pd.DataFrame(InscClaimAmtReimbursed.std(axis = 1), columns = ['InscStd'])
Insc_Std.head()
# mean
Insc_Mean = pd.DataFrame(InscClaimAmtReimbursed.mean(axis = 1), columns = ['InscMean'])
Insc_Mean.head()
# total insurance paid amount
total_insc = claims.groupby('Provider').agg({'InscClaimAmtReimbursed': sum})
total_insc.columns = ['TotalInsc']
total_insc.head()
OperatingPhysician
OperatingPhysician = pd.DataFrame(claims.groupby('Provider')['OperatingPhysician'].nunique())
OperatingPhysician.head()
OtherPhysician
OtherPhysician = pd.DataFrame(claims.groupby('Provider')['OtherPhysician'].nunique())
OtherPhysician.head()
Out
# number of outpatient patients per month
Out = groupby_provider('Out', 'count')
Out = Out.add_prefix('Out')
Out.fillna(0, inplace = True)
Out.head()
# standard deviation
Out_Std = pd.DataFrame(Out.std(axis = 1), columns = ['OutStd'])
Out_Std.head()
# mean
Out_Mean = pd.DataFrame(Out.mean(axis = 1), columns = ['OutMean'])
Out_Mean.head()
# total outpatients
total_out = claims.groupby('Provider').agg({'Out': sum})
total_out.columns = ['TotalOutpatients']
total_out.head()
df = AttendingPhysician.join(AvgAge)
df = df.join(InpatientDays_Std)
df = df.join(InpatientDays_Mean)
df = df.join(InpatientDays_Sum)
df = df.join(unibene_Std)
df = df.join(unibene_Mean)
df = df.join(total_patients)
df = df.join(ClaimDays_Std)
df = df.join(ClaimDays_Mean)
df = df.join(ClaimNum_Std)
df = df.join(ClaimNum_Mean)
df = df.join(total_claim)
df = df.join(Deductible_Std)
df = df.join(Deductible_Mean)
df = df.join(total_deductible)
df = df.join(In_Std)
df = df.join(In_Mean)
df = df.join(total_in)
df = df.join(Insc_Std)
df = df.join(Insc_Mean)
df = df.join(total_insc)
df = df.join(OperatingPhysician)
df = df.join(OtherPhysician)
df = df.join(Out_Std)
df = df.join(Out_Mean)
df = df.join(total_out)
df = df.join(fraud)
df.shape
df.head()
missing_val(df)
def fraud_plot(df, col, title):
fig, ax = plt.subplots(3, figsize = (6, 18))
sns.distplot(np.log(df[col] + 1), ax = ax[0]).set_title('Distribution Plot')
sns.distplot(np.log(df.loc[df.PotentialFraud == 0, col] + 1), hist = False, color = 'b', label = 'No Fraud', kde_kws={'shade': True}, ax = ax[1])
sns.distplot(np.log(df.loc[df.PotentialFraud == 1, col] + 1), hist = False, color = 'r', label = 'Fraud', kde_kws={'shade': True}, ax = ax[1]).set_title('Comparison Distribution Plot')
sns.boxplot(x = df.PotentialFraud, y = np.log(df[col] + 1), ax = ax[2]).set_title('Box Plot')
plt.suptitle(title, fontsize = 15)
plt.show()
AttendingPhysician
fraud_plot(df, 'AttendingPhysician', 'Number of Attending Physicians')
Inpatient Days
fraud_plot(df, 'InDaysStd', 'Standard Deviation of Inpatient Days')
Number of Patients
fraud_plot(df, 'UniBeneStd', 'Standard Deviation of Monthly Number of Unique Patients')
ClaimDays
fraud_plot(df, 'ClaimDaysStd', 'Standard Deviation of Claims Days')
ClaimNumber
fraud_plot(df, 'ClaimNumStd', 'Standard Deviation of Monthly Number of Claims')
Deductible
fraud_plot(df, 'DeductibleStd', 'Standard Deviation of Average Monthly Deductible')
Number of Inpatients
fraud_plot(df, 'InStd', 'Standard Deviation of Average Monthly Inpatients')
Insurance Paid Amount
fraud_plot(df, 'InscStd', 'Standard Deviation of Average Monthly Insurance Paid Amount')
OperatingPhysician
fraud_plot(df, 'OperatingPhysician', 'Number of Operating Physicians')
OtherPhysician
fraud_plot(df, 'OtherPhysician', 'Number of Other Physicians')
Outpatients
fraud_plot(df, 'OutStd', 'Standard Deviation of Average Monthly Number of Outpatients')
fig, ax = plt.subplots(figsize=(20, 20))
corr = df.corr()
sns.heatmap(corr, mask = np.zeros_like(corr, dtype = bool), cmap = sns.diverging_palette(220, 10, as_cmap = True),
square = True, annot = True)
ax.set_ylim(len(corr)+0.5, -0.5)
plt.show()
df.to_csv('claims1.csv')
# load the package
from datetime import date
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (8, 6)
# load the data
inpatient = pd.read_csv('Train_Inpatientdata-1542865627584.csv')
outpatient = pd.read_csv('Train_Outpatientdata-1542865627584.csv')
fraud = pd.read_csv('Train-1542865627584.csv', index_col = 'Provider')
inpatient = inpatient[['ClaimID','DiagnosisGroupCode', 'ClmAdmitDiagnosisCode', 'ClmDiagnosisCode_1',
'ClmDiagnosisCode_10', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6','Provider',
'ClaimStartDt', 'ClaimEndDt']]
outpatient = outpatient[['ClaimID','ClmAdmitDiagnosisCode', 'ClmDiagnosisCode_1',
'ClmDiagnosisCode_10', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6',
'Provider', 'ClaimStartDt', 'ClaimEndDt']]
inpatient.shape
outpatient.shape
# map 'No' to 0 and 'Yes' to 1
fraud_map = {'No': 0, 'Yes': 1}
fraud.PotentialFraud = fraud.PotentialFraud.map(fraud_map)
diagnosis = ['ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmDiagnosisCode_10']
procedure = ['ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6']
diagnosisP = ['Provider','ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
'ClmDiagnosisCode_10']
procedureP = ['Provider','ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6']
# merging inpatient and outpatient
inpatient['In'] = 1
outpatient['Out'] = 1
claims = inpatient.append(outpatient)
# claims and patient and potential fraud
claims.shape
claims.head()
claims.info()
claims['Admit_Diagnosis_EVCodes'] = claims['ClmAdmitDiagnosisCode'].str.contains("V|E", na=False)
true_false = {False: 0, True: 1}
claims.Admit_Diagnosis_EVCodes = claims.Admit_Diagnosis_EVCodes.map(true_false)
claims['DiagnosisGroupCode'] = claims['DiagnosisGroupCode'].replace('OTH', np.nan)
#claims['DiagnosisGroupCode'].sort_values().unique()
claims = claims.astype({'DiagnosisGroupCode': 'float'})
claims['DiagnosisGroupCode'].describe()
claims['DiagnosisGroupCode'].head()
claims.ClaimStartDt = pd.to_datetime(claims.ClaimStartDt)
claims.ClaimEndDt = pd.to_datetime(claims.ClaimEndDt)
claims['diagnosis_count'] = claims[diagnosis].count(axis='columns')
claims['procedure_count'] = claims[procedure].count(axis='columns')
claims['Provider'].nunique()
claims.head()
new_claims = claims[['Provider','ClmAdmitDiagnosisCode']]
#new_claims.head()
#new_claims.set_index('Provider', inplace=True)
#new_claims = pd.concat([new_claims[col] for col in new_claims])
#new_claims = new_claims.to_frame().dropna().reset_index()
unique_admitcode = new_claims.groupby('Provider')['ClmAdmitDiagnosisCode'].nunique().to_frame().reset_index().rename(columns={'ClmAdmitDiagnosisCode': "unique_admitcode"})
#unique_diagnosis = new_claims.groupby('Provider')[0].nunique()
#unique_diagnosis = unique_diagnosis.to_frame().reset_index().rename(columns={0: "unique_diagnosis"})
unique_admitcode.head()
new_claims = claims[['Provider','DiagnosisGroupCode']]
#new_claims.head()
#new_claims.set_index('Provider', inplace=True)
#new_claims = pd.concat([new_claims[col] for col in new_claims])
#new_claims = new_claims.to_frame().dropna().reset_index()
unique_DRG = new_claims.groupby('Provider')['DiagnosisGroupCode'].nunique().to_frame().reset_index().rename(columns={'DiagnosisGroupCode': "unique_DRG"})
#unique_diagnosis = new_claims.groupby('Provider')[0].nunique()
#unique_diagnosis = unique_diagnosis.to_frame().reset_index().rename(columns={0: "unique_diagnosis"})
unique_DRG.head()
new_claims = claims[diagnosisP]
new_claims.set_index('Provider', inplace=True)
new_claims = pd.concat([new_claims[col] for col in new_claims])
new_claims = new_claims.to_frame().dropna().reset_index()
#new_claims.groupby('Provider')['0'].nunique()
unique_diagnosis = new_claims.groupby('Provider')[0].nunique().to_frame().reset_index().rename(columns={0: "unique_diagnosis"})
#fraud_diagnoses = pd.Series(diagnosis_df.to_numpy().flatten()).dropna()
unique_diagnosis.head()
unique_diagnosis.shape
new_claims = claims.loc[claims.In == 1, diagnosisP]
#new_claims = claims[diagnosisP]
new_claims.set_index('Provider', inplace=True)
new_claims = pd.concat([new_claims[col] for col in new_claims])
new_claims = new_claims.to_frame().dropna().reset_index()
unique_diagnosis_In = new_claims.groupby('Provider')[0].nunique()
unique_diagnosis_In = unique_diagnosis_In.to_frame().reset_index().rename(columns={0: "unique_diagnosis_In"})
unique_diagnosis_In.head()
new_claims = claims.loc[claims.Out == 1, diagnosisP]
#new_claims = claims[diagnosisP]
new_claims.set_index('Provider', inplace=True)
new_claims = pd.concat([new_claims[col] for col in new_claims])
new_claims = new_claims.to_frame().dropna().reset_index()
unique_diagnosis_Out = new_claims.groupby('Provider')[0].nunique()
unique_diagnosis_Out = unique_diagnosis_Out.to_frame().reset_index().rename(columns={0: "unique_diagnosis_Out"})
unique_diagnosis_Out.head()
new_claims = claims[procedureP]
new_claims.set_index('Provider', inplace=True)
new_claims = pd.concat([new_claims[col] for col in new_claims])
new_claims = new_claims.to_frame().dropna().reset_index()
#new_claims.groupby('Provider')['0'].nunique()
unique_procedure = new_claims.groupby('Provider')[0].nunique()
unique_procedure = unique_procedure.to_frame().reset_index().rename(columns={0: "unique_procedure"})
#fraud_diagnoses = pd.Series(diagnosis_df.to_numpy().flatten()).dropna()
unique_procedure.head()
new_claims = claims.loc[claims.In == 1, procedureP]
#new_claims = claims[diagnosisP]
new_claims.set_index('Provider', inplace=True)
new_claims = pd.concat([new_claims[col] for col in new_claims])
new_claims = new_claims.to_frame().dropna().reset_index()
unique_procedure_In = new_claims.groupby('Provider')[0].nunique()
unique_procedure_In = unique_procedure_In.to_frame().reset_index().rename(columns={0: "unique_procedure_In"})
unique_procedure_In.head()
new_claims = claims.loc[claims.Out == 1, procedureP]
#new_claims = claims[diagnosisP]
new_claims.set_index('Provider', inplace=True)
new_claims = pd.concat([new_claims[col] for col in new_claims])
new_claims = new_claims.to_frame().dropna().reset_index()
unique_procedure_Out = new_claims.groupby('Provider')[0].nunique()
unique_procedure_Out = unique_procedure_Out.to_frame().reset_index().rename(columns={0: "unique_procedure_Out"})
unique_procedure_Out.head()
def groupby_provider(col, func, freq = 'M'):
temp = claims[['ClaimStartDt', 'Provider', col]]
temp.set_index('ClaimStartDt', inplace = True)
group = pd.DataFrame(temp.groupby([pd.Grouper(freq=freq), 'Provider']).agg(func)) # use the freq parameter
group.reset_index(inplace = True)
group = group.pivot(index = 'Provider', columns = 'ClaimStartDt', values = col)
group.columns = group.columns.strftime('%Y-%m-%d')
return group
# average number of diagnosis codes per claim, by month
diagnosis_count = groupby_provider('diagnosis_count', 'mean')
diagnosis_count = diagnosis_count.add_prefix('Avg')
diagnosis_count.fillna(0, inplace = True)
diagnosis_count.head()
diagnosis_count_Std = pd.DataFrame(diagnosis_count.std(axis = 1), columns = ['Diagnosis_Count_Std'])
diagnosis_count_Std.head()
diagnosis_count_mean = pd.DataFrame(diagnosis_count.mean(axis = 1), columns = ['Diagnosis_Count_Mean'])
diagnosis_count_mean.head()
# average number of procedure codes per claim, by month
procedure_count = groupby_provider('procedure_count', 'mean')
procedure_count = procedure_count.add_prefix('ProcCount')
procedure_count.fillna(0, inplace = True)
procedure_count.head()
procedure_count_Std = pd.DataFrame(procedure_count.std(axis = 1), columns = ['procedure_count_Std'])
procedure_count_Std.head()
procedure_count_mean = pd.DataFrame(procedure_count.mean(axis = 1), columns = ['procedure_count_Mean'])
procedure_count_mean.head()
# number of unique admit diagnosis codes per month
admit_unique = groupby_provider('ClmAdmitDiagnosisCode', pd.Series.nunique)
admit_unique = admit_unique.add_prefix('AdmitUniq')
admit_unique.fillna(0, inplace = True)
admit_unique.head()
admit_unique_Std = pd.DataFrame(admit_unique.std(axis = 1), columns = ['Admit_unique_Std'])
admit_unique_Std.head()
admit_unique_mean = pd.DataFrame(admit_unique.mean(axis = 1), columns = ['Admit_unique_Mean'])
admit_unique_mean.head()
# number of unique diagnosis group (DRG) codes per month
DRG_unique = groupby_provider('DiagnosisGroupCode', pd.Series.nunique)
DRG_unique = DRG_unique.add_prefix('DRGUniq')
DRG_unique.fillna(0, inplace = True)
DRG_unique.head()
DRG_unique_Std = pd.DataFrame(DRG_unique.std(axis = 1), columns = ['DRG_unique_Std'])
DRG_unique_Std.head()
DRG_unique_mean = pd.DataFrame(DRG_unique.mean(axis = 1), columns = ['DRG_unique_Mean'])
DRG_unique_mean.head()
#merge(unique_diagnosis, unique_diagnosis_In, unique_diagnosis_Out,
#unique_procedure, unique_procedure_Out, unique_procedure_In,
#diagnosis_count_Std, diagnosis_count_mean,
#procedure_count_Std, procedure_count_mean)
Providers = unique_diagnosis.merge(
unique_diagnosis_In,
on = 'Provider' ,
how='outer').fillna(0).merge(
unique_diagnosis_Out,
on = "Provider",
how='outer').fillna(0).merge(
unique_procedure,
on = "Provider",
how='outer').fillna(0).merge(
unique_procedure_In,
on = "Provider",
how='outer').fillna(0).merge(
unique_procedure_Out,
on = "Provider",
how='outer').fillna(0).merge(
diagnosis_count_Std,
on = "Provider",
how='outer').fillna(0).merge(
diagnosis_count_mean,
on = "Provider",
how='outer').fillna(0).merge(
procedure_count_Std,
on = "Provider",
how='outer').fillna(0).merge(
procedure_count_mean,
on = "Provider",
how='outer').fillna(0).merge(
unique_admitcode,
on = "Provider",
how='outer').fillna(0).merge(
unique_DRG,
on = "Provider",
how='outer').fillna(0).merge(
admit_unique_Std,
on = "Provider",
how='outer').fillna(0).merge(
admit_unique_mean,
on = "Provider",
how='outer').fillna(0).merge(
DRG_unique_Std,
on = "Provider",
how='outer').fillna(0).merge(
DRG_unique_mean,
on = "Provider",
how='outer').fillna(0)
Providers.sort_values('unique_diagnosis').head(5)
Providers.shape
Providers.to_csv('Codes.csv')
# top admit diagnosis codes
diagnosis_df = claims['ClmAdmitDiagnosisCode']
fraud_diagnoses = pd.Series(diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:20])
plt.xlabel('ICD9 Code')
plt.title('Top Admit Diagnosis Code')
plt.show()
df = pd.concat([claims['ClmAdmitDiagnosisCode'].value_counts(),
claims['ClmAdmitDiagnosisCode'].value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage'))
df['percentage_running_total'] = df['percentage'].cumsum()
df = df.reset_index()
df['number'] = np.arange(len(df))
df.plot(x='number', y='percentage_running_total', kind='line',
figsize=(10, 8), legend=False, style='mo-')
# attach the provider-level fraud label to each claim (needed for the fraud-only plots below)
claims = claims.merge(fraud, how='left', on='Provider')
# top 10 admit diagnosis codes involved with potential fraud
fraud_diagnosis_df = claims.loc[claims.PotentialFraud == 1, 'ClmAdmitDiagnosisCode']
fraud_diagnoses = pd.Series(fraud_diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:10], palette = 'RdBu')
plt.xlabel('ICD9 Code')
plt.title('Top 10 Admit Diagnoses Involved with Potential Fraud')
plt.show()
fraud_diagnoses.unique().shape
# top claim diagnosis codes
diagnosis_df = claims[diagnosis]
fraud_diagnoses = pd.Series(diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:20])
plt.xlabel('ICD9 Code')
plt.title('Top Claim Diagnosis Code')
plt.show()
df = pd.concat([fraud_diagnoses.value_counts(),
fraud_diagnoses.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage'))
df['percentage_running_total'] = df['percentage'].cumsum()
df = df.reset_index()
df['number'] = np.arange(len(df))
df.plot(x='number', y='percentage_running_total', kind='line',
figsize=(10, 8), legend=False, style='mo-')
# top 10 diagnoses involved with fraud
fraud_diagnosis_df = claims.loc[claims.PotentialFraud == 1, diagnosis]
fraud_diagnoses = pd.Series(fraud_diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:10], palette = 'RdBu')
plt.xlabel('ICD9 Code')
plt.title('Top 10 Diagnoses Involved with Potential Fraud')
plt.show()
fraud_diagnoses.unique().shape
# top diagnosis group (DRG) codes
diagnosis_df = claims['DiagnosisGroupCode']
fraud_diagnoses = pd.Series(diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:20])
plt.xlabel('DRG Code')
plt.title('Top Diagnosis Group Code')
plt.show()
df = pd.concat([claims['DiagnosisGroupCode'].value_counts(),
claims['DiagnosisGroupCode'].value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage'))
df['percentage_running_total'] = df['percentage'].cumsum()
df = df.reset_index()
df['number'] = np.arange(len(df))
df.plot(x='number', y='percentage_running_total', kind='line',
figsize=(10, 8), legend=False, style='mo-')
# top 10 DRG codes involved with fraud
fraud_diagnosis_df = claims.loc[claims.PotentialFraud == 1,'DiagnosisGroupCode']
fraud_diagnoses = pd.Series(fraud_diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:10], palette = 'RdBu')
plt.xlabel('DRG Code')
plt.title('Top 10 Diagnosis Group Code Involved with Potential Fraud')
plt.show()
fraud_diagnoses.unique().shape
# top procedure codes
diagnosis_df = claims[procedure]
fraud_diagnoses = pd.Series(diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:20])
plt.xlabel('ICD9 Procedure Code')
plt.title('Top Procedure Code')
plt.show()
df = pd.concat([fraud_diagnoses.value_counts(),
fraud_diagnoses.value_counts(normalize=True).mul(100)],axis=1, keys=('counts','percentage'))
df['percentage_running_total'] = df['percentage'].cumsum()
df = df.reset_index()
df['number'] = np.arange(len(df))
df.plot(x='number', y='percentage_running_total', kind='line',
figsize=(10, 8), legend=False, style='mo-')
# top 10 procedure codes involved with fraud
fraud_diagnosis_df = claims.loc[claims.PotentialFraud == 1, procedure]
fraud_diagnoses = pd.Series(fraud_diagnosis_df.to_numpy().flatten()).dropna()
fig = plt.figure(figsize = (12, 6))
sns.countplot(x = fraud_diagnoses, order = fraud_diagnoses.value_counts().index[:10], palette = 'RdBu')
plt.xlabel('ICD9 Procedure Code')
plt.title('Top 10 Procedure Code Involved with Potential Fraud')
plt.show()
# Load the packages
import numpy as np
import pandas as pd
import math
from scipy import stats
import time
from itertools import cycle
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from pyod.models.abod import ABOD
from pyod.models.iforest import IForest
from pyod.models.lof import LOF
from pyod.models.pca import PCA
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report, precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
SEED = 42
# # load the data
# demo = pd.read_csv('demo_agg.csv', index_col = 0)
# claim = pd.read_csv('claims1.csv')
# code = pd.read_csv('Codes.csv', index_col = 0)
# print(demo.shape)
# print(claim.shape)
# print(code.shape)
# df.columns
# # merge
# claim.drop('PotentialFraud', axis = 1, inplace = True)
# df = claim.merge(demo, how = 'left', on = 'Provider')
# df = df.merge(code, how = 'left', on = 'Provider')
# df.shape
# df.columns
# df.to_csv('merge.csv', index = False)
# load the data
df = pd.read_csv('merge.csv')
# set provider as index
df.set_index('Provider', inplace = True)
df.shape
X = df.drop('PotentialFraud', axis = 1)
y = df.PotentialFraud
# check missing values
def check_missing(df):
missing = df.isnull().sum()
missing_percentage = (df.isnull().sum()/len(df)*100).round(2)
missing_val = pd.concat([missing, missing_percentage], axis = 1)
missing_val.columns = ['Missing Values', '% Missing']
total_columns = df.shape[1]
missing_columns = (df.isnull().sum() > 0).sum()
print('Out of {} columns, {} columns have missing values'.format(total_columns, missing_columns))
return missing_val
check_missing(X)
y.value_counts(normalize = True)
tsne = TSNE(random_state = SEED)
Tsne_transformed = tsne.fit_transform(X)
sns.set_style("white")
plt.figure(figsize = (8, 6))
xs = Tsne_transformed[:,0]
ys = Tsne_transformed[:,1]
sns.scatterplot(xs, ys, hue=y, palette = 'RdBu')
plt.title('t-SNE Visualization', fontsize = 15)
plt.legend()
plt.show()
corr = df.corr()
fig, ax = plt.subplots(figsize = (50, 50))
sns.heatmap(corr, annot = True, square = True)
ax.set_ylim(len(corr)+0, -0)
plt.show()
# features most correlated with the target variable:
fraud_corr = corr['PotentialFraud'].sort_values(ascending = False)
relevant_features = fraud_corr[:11].index.tolist()
fig, ax = plt.subplots(figsize = (10, 10))
fraud_corr_matrix = df[relevant_features].corr()
sns.heatmap(fraud_corr_matrix, annot = True, square = True)
ax.set_ylim(len(fraud_corr_matrix)+0, -0)
plt.show()
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = SEED)
# define which resampling method and which ML model to use in the pipeline
scaler = StandardScaler()
resampling = ADASYN(random_state = SEED, sampling_strategy = 1.0)
model = RandomForestClassifier(random_state = SEED)
# define the pipeline: scaling, then ADASYN resampling, then the random forest classifier
pipeline = Pipeline([('Scaler', scaler), ('ADASYN', resampling), ('Classifier', model)])
# fit the data and get the train score
pipeline = pipeline.fit(X_train, y_train)
# features and feature importances
sns.set()
fig = plt.figure(figsize = (16, 8))
features = X_train.columns
importances = pipeline.steps[2][1].feature_importances_
indices = np.argsort(importances)[::-1]
names = [features[i] for i in indices]
plt.title("Feature Importance", fontsize = 15)
plt.bar(range(X_train.shape[1]), importances[indices])
plt.xticks(range(X_train.shape[1]), names, rotation=90)
plt.show()
Four unsupervised outlier detection models from PyOD are compared, one from each of the following families:
Linear Models for Outlier Detection: Principal Component Analysis (PCA)
Proximity-Based Outlier Detection Models: Local Outlier Factor (LOF)
Probabilistic Models for Outlier Detection: Angle-based Outlier Detector (ABOD)
Outlier Ensembles and Combination Frameworks: Isolation Forest
# Define the outliers_fraction
outliers_fraction = y.value_counts(normalize = True)[1]
clusters_separation = [0]
# scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Define four outlier detection tools to be compared
classifiers = {
'Angle-based Outlier Detector (ABOD)':
ABOD(n_neighbors = 20, contamination=outliers_fraction),
'Isolation Forest': IForest(n_estimators = 500, bootstrap = True, contamination=outliers_fraction,
random_state=SEED),
'Local Outlier Factor (LOF)':
LOF(n_neighbors = 20, contamination=outliers_fraction),
'Principal Component Analysis (PCA)': PCA(
contamination=outliers_fraction, random_state=SEED)
}
train_precision = []
train_recall = []
train_f1 = []
test_precision = []
test_recall = []
test_f1 = []
for clf_name, clf in classifiers.items():
# fit the dataset to the model
clf.fit(X_train_scaled)
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_
# predict raw anomaly score
y_test_scores = clf.decision_function(X_test_scaled)
# prediction of a datapoint category outlier or inlier
y_test_pred = clf.predict(X_test_scaled)
# no of errors in prediction
n_errors = (y_test_pred != y_test).sum()
print('No of Errors : ', clf_name, n_errors)
# classification report
print(classification_report(y_test, y_test_pred))
# precision, recall, f1
train_precision.append(precision_score(y_train, y_train_pred))
train_recall.append(recall_score(y_train, y_train_pred))
train_f1.append(f1_score(y_train, y_train_pred))
test_precision.append(precision_score(y_test, y_test_pred))
test_recall.append(recall_score(y_test, y_test_pred))
test_f1.append(f1_score(y_test, y_test_pred))
# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
# visualize the results
visualize(clf_name, X_train_scaled[:,[0,5]], y_train, X_test_scaled[:,[0,5]], y_test, y_train_pred,
y_test_pred, show_figure=True, save_figure=False)
print('-'*80)
anomaly_models = ['Angle-based Outlier Detector (ABOD)',
'Isolation Forest',
'Local Outlier Factor (LOF)',
'Principal Component Analysis (PCA)']
anomaly_score = pd.DataFrame({'Train Precision': train_precision, 'Train Recall': train_recall, 'Train F1': train_f1, \
'Test Precision': test_precision, 'Test Recall': test_recall, 'Test F1': test_f1}, index = anomaly_models)
anomaly_score.sort_values(by = 'Test F1', ascending = False)
anomaly_sorted_score = anomaly_score.sort_values(by = 'Test F1', ascending = False)
anomaly_sorted_score.plot.bar(rot=90,colormap = 'Set3', figsize=(18,6))
plt.title('Unsupervised Anomaly Detection Models Evaluation Scores', fontsize = 15)
plt.show()
from sklearn.decomposition import PCA
# perform PCA to reduce the dimension for clustering
# Create scaler: scaler
scaler = StandardScaler()
# Create a PCA instance: pca
pca = PCA()
# Create pipeline: pipeline
pipeline = make_pipeline(scaler, pca)
# Fit the pipeline to the samples
pipeline.fit(X_train)
# Plot the explained variances ratio
features = range(pca.n_components_)
plt.plot(features, pca.explained_variance_ratio_.cumsum())
plt.hlines(y = 0.95, xmin= 0 , xmax = len(features), color = 'r', linestyles = 'dashed')
plt.xlabel('PCA feature')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances')
plt.show()
# find number of features for cumulative importance of 95%
print('Number of features for 95% importance:', np.where(pca.explained_variance_ratio_.cumsum() > 0.95)[0][0])
from sklearn.decomposition import PCA
train_score = []
test_score = []
training_time = []
def build_model(input_model, model_name):
# define which resampling method and which ML model to use in the pipeline
scaler = StandardScaler()
pca = PCA(n_components = 25)
resampling = ADASYN(random_state = SEED, sampling_strategy = 1.0)
model = input_model
# define the pipeline: scaling, PCA, ADASYN resampling, then the given classifier
pipeline = Pipeline([('Scaler', scaler), ('PCA', pca), ('ADASYN', resampling), ('Classifier', model)])
# fit the data and get the train score
start = time.time()
pipeline.fit(X_train, y_train)
stop = time.time()
y_pred_train = pipeline.predict(X_train)
train_score.append(f1_score(y_train, y_pred_train))
# training time
train_time = stop - start
training_time.append(train_time)
# predict
y_pred = pipeline.predict(X_test)
test_score.append(f1_score(y_test, y_pred))
return pipeline
nb = build_model(GaussianNB(), 'Naive Bayes')
logreg = build_model(LogisticRegression(random_state = SEED), 'Logistic Regression')
knn = build_model(KNeighborsClassifier(), 'KNN')
svc = build_model(SVC(random_state = SEED), 'Support Vector Classifier')
dt = build_model(DecisionTreeClassifier(max_depth = 10, random_state = SEED), 'Decision Tree')
rf = build_model(RandomForestClassifier(n_estimators = 100, max_depth = 10, random_state = SEED), 'Random Forest')
xgb = build_model(XGBClassifier(n_estimators = 100, max_depth = 10,random_state = SEED), 'XGBoost')
# take a look at the scores and go back to tune the hyperparameters if there is any indication of overfitting or underfitting
classifiers = ['Naive Bayes', 'Logistic Regression', 'KNN', 'SVM', 'Decision Tree', 'Random Forest', 'XGBoost']
result = pd.DataFrame({'Train F1': train_score, 'Test F1': test_score, 'Training Time': training_time}, index = classifiers)
result.sort_values(by = 'Test F1', ascending = False)
# visualize the metrics
base_score = result.iloc[:, 0:2].sort_values(by = 'Test F1', ascending = False)
fig = plt.figure()
base_score.plot.bar(rot=0, colormap = 'Set3', figsize=(12,6))
plt.title('Baseline Model F1 Scores', fontsize = 15)
plt.show()
# visualize the training time
base_time = result.iloc[:, 2].sort_values()
fig = plt.figure()
base_time.plot.bar(rot=0, colormap = 'Set3', figsize=(12, 6))
plt.title('Baseline Model Training Time', fontsize = 15)
plt.show()
# visualize the classification report
def v_clfreport(model, title):
y_pred = model.predict(X_test)
clf_report = classification_report(y_test, y_pred, output_dict=True)
df = pd.DataFrame(clf_report).iloc[:-1, :2].T
fig, ax = plt.subplots(figsize = (6, 2))
sns.heatmap(df, annot=True)
plt.title(title)
ax.set_ylim(len(df)+0, -0)
plt.show()
# visualize classification report on the test set
v_clfreport(nb, 'Baseline Model Naive Bayes Classification Report')
v_clfreport(logreg, 'Baseline Model Logistic Regression Classification Report')
v_clfreport(xgb, 'Baseline Model XGBoost Classification Report')
v_clfreport(knn, 'Baseline Model KNN Classification Report')
v_clfreport(rf, 'Baseline Model Random Forest Classification Report')
v_clfreport(svc, 'Baseline Model SVM Classification Report')
v_clfreport(dt, 'Baseline Model Decision Tree Classification Report')
def build_tune_model(input_model, param_dist):
# define which resampling method and which ML model to use in the pipeline
scaler = StandardScaler()
pca = PCA(n_components = 25)
resampling = ADASYN(random_state = SEED, sampling_strategy = 1.0)
model = input_model
# define the pipeline: scaling, PCA, ADASYN resampling, then the given classifier
pipeline = Pipeline([('Scaler', scaler), ('PCA', pca), ('ADASYN', resampling), ('Classifier', model)])
# stratified k fold
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
# tune hyperparameter
pipeline = RandomizedSearchCV(pipeline, param_distributions = param_dist, scoring = 'f1', cv = kfold, n_iter = 10, n_jobs = -1, random_state = SEED)
# fit the data
pipeline.fit(X_train, y_train)
pipeline = pipeline.best_estimator_
return pipeline
# define param_dist
logreg_param = {'Classifier__C' : np.logspace(-4, 4, 20)}
knn_param = {'Classifier__n_neighbors': range(5, 10)}
svc_param = {'Classifier__C' : np.logspace(-4, 4, 20),
'Classifier__gamma': np.logspace(-4, 4, 20)}
dt_param = {'Classifier__max_depth': range(3, 5),
'Classifier__max_features': range(1, 12),
'Classifier__min_samples_leaf': range(5, 10),
'Classifier__criterion': ["gini", "entropy"]}
rf_param = {'Classifier__n_estimators': range(400, 1000, 10),
'Classifier__max_features': ['auto', 'sqrt'],
'Classifier__max_depth': range(3, 7),
'Classifier__min_samples_split': range(2, 8),
'Classifier__min_samples_leaf': range(3, 10)}
# note: XGBoost uses min_child_weight and colsample_bytree rather than the sklearn tree parameter names
xgb_param = {'Classifier__n_estimators': range(20, 200, 10),
'Classifier__learning_rate': [0.0001, 0.001, 0.01, 1, 2, 5],
'Classifier__max_depth': range(3, 5),
'Classifier__min_child_weight': range(1, 10),
'Classifier__colsample_bytree': [0.6, 0.7, 0.8, 1.0],
'Classifier__subsample': [0.6, 0.7, 0.75, 0.8]}
logreg1 = build_tune_model(LogisticRegression(random_state = SEED), logreg_param)
knn1 = build_tune_model(KNeighborsClassifier(), knn_param)
svc1 = build_tune_model(SVC(random_state = SEED), svc_param)
dt1 = build_tune_model(DecisionTreeClassifier(random_state = SEED), dt_param)
rf1 = build_tune_model(RandomForestClassifier(random_state = SEED), rf_param)
xgb1 = build_tune_model(XGBClassifier(random_state = SEED), xgb_param)
train_score_1 = []
cv_score_1 = []
test_precision_1 = []
test_recall_1 = []
test_f1_1 = []
def score(model):
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_f1 = f1_score(y_train, y_train_pred)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
cv_f1 = np.mean(cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1', n_jobs=-1))
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
train_score_1.append(train_f1)
cv_score_1.append(cv_f1)
test_precision_1.append(test_precision)
test_recall_1.append(test_recall)
test_f1_1.append(test_f1)
score(nb)
score(logreg1)
score(knn1)
score(svc1)
score(dt1)
score(rf1)
score(xgb1)
# take a look at the scores and go back to tune the hyperparameters if there is any indication of overfitting or underfitting
classifiers = ['Naive Bayes', 'Logistic Regression', 'KNN', 'SVM', 'Decision Tree', 'Random Forest', 'XGBoost']
result1 = pd.DataFrame({'Train F1': train_score_1, 'CV F1': cv_score_1, 'Test Precision': test_precision_1, \
'Test Recall': test_recall_1, 'Test F1': test_f1_1}, index = classifiers)
result1.sort_values(by = 'Test F1', ascending = False)
# visualize the metrics
tuned_score = result1.sort_values(by = 'Test F1', ascending = False)
fig = plt.figure()
tuned_score.plot.bar(rot=0, colormap = 'Set3', figsize=(12,6))
plt.title('Final Models Scores', fontsize = 15)
plt.show()
# visualize classification report on the test set
v_clfreport(logreg1, 'Logistic Regression Classification Report')
v_clfreport(xgb1, 'XGBoost Classification Report')
v_clfreport(knn1, 'KNN Classification Report')
v_clfreport(rf1, 'Random Forest Classification Report')
v_clfreport(svc1, 'SVM Classification Report')
v_clfreport(dt1, 'Decision Tree Classification Report')
# final model -> logreg
logreg1.steps[3][1].get_params()
# plot the precision recall curve
y_pred_proba = logreg1.predict_proba(X_test)
thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal', 'red', 'yellow', 'green', 'blue','black'])
plt.figure(figsize=(10,10))
j = 1
for i,color in zip(thresholds,colors):
y_test_prob = y_pred_proba[:,1] > i
precision, recall, _ = precision_recall_curve(y_test, y_test_prob) # use _ so the outer thresholds list is not rebound
# Plot Precision-Recall curve
plt.plot(recall, precision, color=color,
label='Threshold: %s'%i)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Logistic Regression Precision-Recall Curve', fontsize = 15)
plt.legend(loc="lower left")
baseline = len(y_test[y_test==1]) / len(y_test)
plt.plot([0, 1], [baseline, baseline], linestyle='--', label='Baseline')
plt.legend(loc="lower left")
plt.show()
figure, ax = plt.subplots(figsize=(10,10))
plot_precision_recall_curve(logreg1, X_test, y_test, name = 'Logistic Regression', ax = ax)
plot_precision_recall_curve(xgb1, X_test, y_test, name = 'XGBoost', ax = ax)
plot_precision_recall_curve(rf1, X_test, y_test, name = 'Random Forest', ax = ax)
baseline = len(y_test[y_test==1]) / len(y_test)
plt.plot([0, 1], [baseline, baseline], linestyle='--', label='Baseline')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve', fontsize = 15)
plt.legend(loc="lower left")
plt.show()
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import h5py
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]
df = pd.read_csv("C:/Users/Carrie/Desktop/MScA Courses/HealthAnalytics/FinalProject/input/merge.csv")
print(df.shape)
df.head()
df.isnull().values.any()
df.drop('Provider', axis=1, inplace=True)
frauds = df[df.PotentialFraud == 1]
normal = df[df.PotentialFraud == 0]
frauds.shape
normal.shape
Training an autoencoder for anomaly detection is done using only normal data; any new observation that the autoencoder cannot reconstruct well is then treated as an anomaly.
Compared to classifiers, this approach has two important benefits: it does not require labeled fraud cases for training, and it can flag fraud patterns it has never seen before.
Reserve 20% of the data for testing. The stratify parameter keeps the class ratio the same within each data set.
X_train_split, X_test_split = train_test_split(df, test_size=0.2,
stratify=df['PotentialFraud'],
random_state=RANDOM_SEED)
Extract the labels from train and test data.
y_train = X_train_split['PotentialFraud']
X_train = X_train_split.drop(['PotentialFraud'], axis=1)
y_test = X_test_split['PotentialFraud']
X_test = X_test_split.drop(['PotentialFraud'], axis=1)
print('Train: shape X',X_train.shape,', shape Y',y_train.shape)
print('Test: shape X',X_test.shape,', shape Y',y_test.shape)
Select the "normal" transactions from the training data to train autoencoder on them.
X_trainNorm = X_train[y_train == 0]
X_trainNorm_val = X_trainNorm.values # Only values, axis labels removed. This is input for the Autoencoder
X_testNorm_val = X_test[y_test == 0].values # The validation data
print(y_train.shape)
print(X_train.shape)
X_trainNorm_val.shape
This autoencoder contains fully connected layers with 14, 7, 14 and input_dim neurons, respectively, where input_dim is the number of input features. The first two layers form the encoder, the last two the decoder.
Training is done with $L_1$ activity regularization on the first encoder layer.
Set parameters:
input_dim = X_trainNorm_val.shape[1]
layer1_dim = 14
encoder_dim = 7
Create tensors:
input_layer = Input(shape=(input_dim, ))
encoder1 = Dense(layer1_dim, activation="tanh",
activity_regularizer=regularizers.l1(10e-5))(input_layer)
encoder2 = Dense(encoder_dim, activation="relu")(encoder1)
decoder1 = Dense(layer1_dim, activation='tanh')(encoder2)
decoder2 = Dense(input_dim, activation='linear')(decoder1)
print('input_layer: ',input_layer)
print('encoder1',encoder1)
print('encoder2',encoder2)
print('decoder1',decoder1)
print('decoder2',decoder2)
Create autoencoder from the tensors:
autoencoder = Model(inputs=input_layer, outputs=decoder2)
autoencoder.summary()
Train the model with a batch size of 32 samples (the maximum number of epochs is set very high because early stopping ends training much earlier) and save the best performing model to a file using the ModelCheckpoint callback provided by Keras.
Use the EarlyStopping callback that stops training when a monitored quantity has stopped improving.
%%time
nb_epoch = 1000000
batch_size = 32
autoencoder.compile(optimizer='adam',
loss='mean_squared_error')
checkpointer = ModelCheckpoint(filepath="model.h5",
verbose=0,
save_best_only=True)
earlystopping = EarlyStopping(monitor='val_loss', patience=1, verbose=0) # 'patience' number of not improving epochs
history = autoencoder.fit(X_trainNorm_val, X_trainNorm_val,
epochs=nb_epoch,
batch_size=batch_size,
shuffle=True,
validation_data=(X_testNorm_val, X_testNorm_val),
verbose=1,
callbacks=[checkpointer, #tensorboard,
earlystopping]).history
Load the model saved by checkpointer.
autoencoder = load_model('model.h5')
Visualize the learning process.
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper right');
The reconstruction errors on both train and test data converge nicely.
Calculate predictions by the autoencoder:
testPredictions = autoencoder.predict(X_test)
X_test.shape,testPredictions.shape
Calculate mean squared error.
testMSE = mean_squared_error(X_test.transpose(), testPredictions.transpose(),
multioutput='raw_values')
error_df = pd.DataFrame({'reconstruction_error': testMSE,'true_class': y_test})
error_df.head()
error_df.shape
error_df.reconstruction_error.describe()
error_df.reconstruction_error.quantile(0.9)
Plot the histogram of reconstruction errors for normal data (a fixed upper cutoff removes extreme errors so the histogram stays readable).
normal_error_df = error_df[(error_df['true_class'] == 0) & (error_df['reconstruction_error'] < 1089669752.125486)]
len(normal_error_df.reconstruction_error.values)
fig = plt.figure()
ax = fig.add_subplot(111)
sns.distplot(normal_error_df.reconstruction_error.values)
plt.title('Normal Transactions', fontsize=15)
plt.xlabel('Reconstruction Errors', fontsize=15)
Compare the distribution of reconstruction errors for normal data with that for fraudulent data.
fig = plt.figure()
ax = fig.add_subplot(111)
fraud_error_df = error_df[error_df['true_class'] == 1]
sns.distplot(fraud_error_df.reconstruction_error.values)
plt.title('Fraudulent Transactions', fontsize=15)
plt.xlabel('Reconstruction Errors', fontsize=15)
Reconstruction errors for fraudulent data are much larger.
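To put numbers on this gap, the per-class summary statistics of the reconstruction error can be compared directly (a small check using the error_df built above):
error_df.groupby('true_class').reconstruction_error.describe()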
from sklearn.metrics import (confusion_matrix, auc, roc_curve, cohen_kappa_score, accuracy_score)
Calculate ROC curve and AUC:
fpr, tpr, thresholds = roc_curve(error_df.true_class, error_df.reconstruction_error)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = %0.4f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show();
AUC for the autoencoder predictions is pretty high.
To detect fraud from the reconstruction MSE, pick a high quantile of the error distribution of the "normal" class as the decision threshold; the quantile level controls the probability of a type I error (flagging a normal claim as fraud). Here we take the 0.995 quantile of the reconstruction error for the "normal" class.
threshold = normal_error_df.reconstruction_error.quantile(q=0.995)
threshold
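By construction roughly 0.5% of the (clipped) normal errors exceed this threshold; as a quick hedged check, the empirical false-positive rate on the full normal class can be computed from error_df:
# empirical type I error: fraction of truly normal claims whose error exceeds the threshold
(error_df.loc[error_df.true_class == 0, 'reconstruction_error'] > threshold).mean()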
Plot all errors, normal and fraud cases marked, and the threshold:
groups = error_df.groupby('true_class')
fig, ax = plt.subplots()
for name, group in groups:
if name == 1:
MarkerSize = 7
Color = 'orangered'
Label = 'Fraud'
Marker = 'd'
else:
MarkerSize = 3.5
Color = 'b'
Label = 'Normal'
Marker = 'o'
ax.plot(group.index, group.reconstruction_error,
linestyle='',
color=Color,
label=Label,
ms=MarkerSize,
marker=Marker)
ax.hlines(threshold, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend(loc='upper left', bbox_to_anchor=(0.95, 1))
# plt.title("Probabilities of fraud for different classes", fontsize=15)
plt.ylabel("Reconstruction Error", fontsize=15)
plt.xlabel("Data point index", fontsize=15)
plt.show();
To draw more precise conclusions, calculate the confusion matrix:
y_pred = [1 if e > threshold else 0 for e in error_df.reconstruction_error.values]
conf_matrix = confusion_matrix(error_df.true_class, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d");  # LABELS (e.g. ['Normal', 'Fraud']) is assumed to be defined earlier in the notebook
plt.title("Confusion matrix", fontsize=15)
plt.ylabel('True class', fontsize=15)
plt.xlabel('Predicted class', fontsize=15)
plt.show()
cohen_kappa_score(error_df.true_class, y_pred),accuracy_score(error_df.true_class, y_pred)
The model detects many fraudulent cases, with accuracy over 93%. However, because of the extreme class imbalance, Cohen's kappa gives a much more conservative evaluation.
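Since accuracy is misleading under such imbalance, a per-class precision/recall breakdown gives a clearer picture. A short sketch using scikit-learn's classification_report on the y_pred computed above (the class names 'Normal'/'Fraud' are assumed labels):
from sklearn.metrics import classification_report
print(classification_report(error_df.true_class, y_pred, target_names=['Normal', 'Fraud']))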
# load the package
from datetime import date
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn import preprocessing
import networkx as nx
from itertools import permutations
from itertools import count
plt.rcParams['figure.figsize'] = (8, 6)
# load the data
inpatient = pd.read_csv('../Train_Inpatientdata-1542865627584.csv')
outpatient = pd.read_csv('../Train_Outpatientdata-1542865627584.csv')
fraud = pd.read_csv('../Train-1542865627584.csv', index_col = 'Provider')
claims = pd.concat([inpatient, outpatient], sort=False)  # DataFrame.append is deprecated; concat gives the same result
claims = claims.merge(fraud, how = 'left', on = 'Provider')
claims.head()
claims = claims[['Provider', 'BeneID', 'OperatingPhysician',
'AttendingPhysician', 'OtherPhysician','PotentialFraud']]
claims.head()
claims.shape
claims
Based on Provider and Patient
ProviderRel = claims[['Provider','BeneID','PotentialFraud' ]]
ProviderRel.head()
ProviderRel.shape
Similar = claims[['Provider','BeneID']]
#[claims['PotentialFraud'] == 'Yes']
Similar = Similar.drop_duplicates()
Similar.head()
Similar.shape
top = Similar['BeneID'].value_counts().head(300)
top.head()
top.tail()
#Similar['BeneID']
Similar = Similar[Similar.BeneID.isin(top.index)]
Similar.shape
ss = Similar.groupby('Provider')
count_dct = Similar.groupby('Provider').count().to_dict()
count_dct = count_dct.values()
unique_grp = Similar['Provider'].unique() # get the unique groups
unique_atr = Similar['BeneID'].unique() # get the unique attributes
count_dct1 = Similar.groupby('Provider').count()
print(unique_grp.shape)
print(unique_atr.shape)
combos = list(permutations(unique_grp, 2)) # get all combinations of the groups
comp_df = pd.DataFrame(data = (combos), columns = ['Group','LikeGroup']) # create the array to put comparison data into
comp_df['CommonWords'] = 0
i = 1
for atr in unique_atr:
    print("i step: " + str(i))
    i = i + 1
    # rows that contain the attribute (beneficiary) considered in this iteration
    temp_df = Similar[Similar['BeneID'] == atr]
    # ordered pairs of providers that share this beneficiary
    myl = list(permutations(temp_df['Provider'], 2))
    for comb in myl:
        # increment CommonWords for the matching (Group, LikeGroup) pair
        comp_df.loc[(comp_df['Group'] == comb[0]) & (comp_df['LikeGroup'] == comb[1]), 'CommonWords'] += 1
print('i OVER')
#for key, val in count_dct.items(): # put the previously computed TotalCount into the comparison dataframe
# print('k step:' + str(k))
# comp_df.loc[comp_df['Provider'] == key, 'TotalCount'] = val
#comp_df['PCT'] = (comp_df['CommonWords'] * 100.0 / comp_df['TotalCount']).round()
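The pairwise loop above scales poorly, since it scans every provider pair for every attribute. As an alternative sketch (assuming Similar still holds the deduplicated Provider/BeneID pairs; CommonWords_fast is just an illustrative column name), a self-merge counts shared beneficiaries per provider pair in one shot:
pairs = Similar.merge(Similar, on='BeneID', suffixes=('_a', '_b'))  # every pair of providers sharing a beneficiary
pairs = pairs[pairs['Provider_a'] != pairs['Provider_b']]           # drop self-pairs
shared = pairs.groupby(['Provider_a', 'Provider_b']).size().reset_index(name='CommonWords_fast')
shared.sort_values('CommonWords_fast', ascending=False).head()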
comp_df1 = comp_df
comp_df1 = comp_df1.merge(fraud, how = 'left', left_on='Group', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"PotentialFraud": "GroupFraud"})
comp_df1 = comp_df1.merge(fraud, how = 'left', left_on='LikeGroup', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"PotentialFraud": "LikeGroupFraud"})
comp_df1 = comp_df1.merge(count_dct1, how = 'left', left_on='Group', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"BeneID": "TotalGroup"})
comp_df1 = comp_df1.merge(count_dct1, how = 'left', left_on='LikeGroup', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"BeneID": "TotalLikeGroup"})
comp_df1['PCT'] = (comp_df1['CommonWords'] * 100.0 / comp_df1['TotalGroup']).round()
comp_df1.to_csv('BenProvData.csv')
comp_df1.sort_values('CommonWords', ascending=False).head(20)
Groups = comp_df1.sort_values('CommonWords', ascending=False).head(20)
Groups['Group'].unique()
Groups['LikeGroup'].unique()
newGroup = ProviderRel[(ProviderRel['Provider'].isin(Groups['Group'].unique())) |
(ProviderRel['Provider'].isin(Groups['LikeGroup'].unique()))].drop_duplicates()
newGroup.head()
newGroup.shape
FGa = nx.from_pandas_edgelist(newGroup, source='Provider',
target='BeneID')
fraud1 = fraud
fraud1 = fraud1.reset_index()
fraud1 = fraud1[fraud1['Provider'].isin(newGroup.Provider)].set_index('Provider')
nx.set_node_attributes(FGa, name = 'Fraud', values = pd.Series(fraud1.PotentialFraud).to_dict())
#nx.set_node_attributes(FGa, 'gender', pd.Series(nodes.gender, index=nodes.node).to_dict())
fraud1 = fraud
fraud1 = fraud1.reset_index()
fraud1 = fraud1[fraud1['Provider'].isin(newGroup.Provider)]
Ben = newGroup[['BeneID','PotentialFraud']]
Ben.PotentialFraud = 'Ben'
Ben = Ben.drop_duplicates()
Ben = Ben.rename(columns={"BeneID": "Provider"})
fraud2 = pd.concat([fraud1, Ben])  # append is deprecated; concat stacks provider rows and beneficiary rows
fraud2 = fraud2.set_index('Provider')
fraud2.head(20)
nx.set_node_attributes(FGa, name = 'Fraud', values = pd.Series(fraud2.PotentialFraud).to_dict())
#nx.set_node_attributes(FGa, 'gender', pd.Series(nodes.gender, index=nodes.node).to_dict())
pd.Series(fraud2.PotentialFraud)
#.to_dict()
nx.get_node_attributes(FGa, 'Fraud')
len(FGa)
print(nx.info(FGa))
#FGa.nodes()
# inspect the Fraud attribute of a single node, e.g. the first one
FGa.nodes[list(FGa.nodes())[0]]['Fraud']
from itertools import count
plt.figure(figsize=(20,20))
# get unique groups
groups = set(nx.get_node_attributes(FGa,'Fraud').values())
mapping = dict(zip(sorted(groups),count()))
nodes = FGa.nodes()
colors = [mapping[FGa.nodes[n]['Fraud']] for n in nodes]  # G.nodes[n] replaces the removed G.node[n] accessor
# drawing nodes and edges separately so we can capture collection for colobar
pos = nx.spring_layout(FGa)
ec = nx.draw_networkx_edges(FGa, pos, alpha=0.05)
nc = nx.draw_networkx_nodes(FGa, pos, nodelist=nodes, node_color=colors,
                            node_size=50, cmap=plt.cm.jet)  # with_labels is not a draw_networkx_nodes argument, so it is dropped
#plt.colorbar(nc)
plt.axis('off')
plt.show()
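The jet colormap above does not indicate which color corresponds to which Fraud category. A small hedged addition, reusing the mapping dict built above, attaches a legend (insert it just before plt.show() in the cell above):
import matplotlib.patches as mpatches
cmap = plt.cm.jet
norm = plt.Normalize(vmin=min(mapping.values()), vmax=max(mapping.values()))  # same scaling draw_networkx_nodes applies by default
handles = [mpatches.Patch(color=cmap(norm(v)), label=str(k)) for k, v in mapping.items()]
plt.legend(handles=handles, loc='upper right')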
plt.figure(figsize=(20,20))
nx.draw(FGa,
with_labels = False,
font_size = 5,
nodelist=nodes,
node_color=colors,
node_size = 200
)
plt.show()
plt.figure(figsize=(10,8))
nx.draw(FGa,
with_labels = False,
font_size = 5,
#node_color=['red', 'blue'],
node_size = 10
)
plt.show()
Similar = claims[['Provider','AttendingPhysician']]
#[claims['PotentialFraud'] == 'Yes']
Similar = Similar.drop_duplicates()
Similar.head()
Similar.shape
top = Similar['AttendingPhysician'].value_counts().head(2000)
top.head()
top.tail()
#Similar['BeneID']
Similar = Similar[Similar.AttendingPhysician.isin(top.index)]
Similar.shape
ss = Similar.groupby('Provider')
count_dct = Similar.groupby('Provider').count().to_dict()
count_dct = count_dct.values()
unique_grp = Similar['Provider'].unique() # get the unique groups
unique_atr = Similar['AttendingPhysician'].unique() # get the unique attributes
count_dct1 = Similar.groupby('Provider').count()
print(unique_grp.shape)
print(unique_atr.shape)
combos = list(permutations(unique_grp, 2)) # get all combinations of the groups
comp_df = pd.DataFrame(data = (combos), columns = ['Group','LikeGroup']) # create the array to put comparison data into
comp_df['CommonWords'] = 0
i = 1
for atr in unique_atr:
    print("i step: " + str(i))
    i = i + 1
    # rows that contain the attribute (attending physician) considered in this iteration
    temp_df = Similar[Similar['AttendingPhysician'] == atr]
    # ordered pairs of providers that share this attending physician
    myl = list(permutations(temp_df['Provider'], 2))
    for comb in myl:
        # increment CommonWords for the matching (Group, LikeGroup) pair
        comp_df.loc[(comp_df['Group'] == comb[0]) & (comp_df['LikeGroup'] == comb[1]), 'CommonWords'] += 1
print('i OVER')
#for key, val in count_dct.items(): # put the previously computed TotalCount into the comparison dataframe
# print('k step:' + str(k))
# comp_df.loc[comp_df['Provider'] == key, 'TotalCount'] = val
#comp_df['PCT'] = (comp_df['CommonWords'] * 100.0 / comp_df['TotalCount']).round()
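The same self-merge shortcut sketched earlier for beneficiaries applies here as well, with AttendingPhysician in place of BeneID.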
comp_df1 = comp_df
comp_df1 = comp_df1.merge(fraud, how = 'left', left_on='Group', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"PotentialFraud": "GroupFraud"})
comp_df1 = comp_df1.merge(fraud, how = 'left', left_on='LikeGroup', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"PotentialFraud": "LikeGroupFraud"})
comp_df1 = comp_df1.merge(count_dct1, how = 'left', left_on='Group', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"AttendingPhysician": "TotalGroup"})
comp_df1 = comp_df1.merge(count_dct1, how = 'left', left_on='LikeGroup', right_on='Provider')
comp_df1 = comp_df1.rename(columns={"AttendingPhysician": "TotalLikeGroup"})
comp_df1['PCT'] = (comp_df1['CommonWords'] * 100.0 / comp_df1['TotalGroup']).round()
comp_df1.to_csv('ProvProvData.csv')
comp_df1.sort_values('CommonWords', ascending=False).head()
Groups = comp_df1.sort_values('CommonWords', ascending=False).head(30)
Groups['Group'].unique()
Groups['LikeGroup'].unique()
ProviderRel2 = claims[['Provider','AttendingPhysician','PotentialFraud' ]]
newGroup = ProviderRel2[(ProviderRel2['Provider'].isin(Groups['Group'].unique())) |
(ProviderRel2['Provider'].isin(Groups['LikeGroup'].unique()))].drop_duplicates()
newGroup.head()
newGroup.shape
FGa = nx.from_pandas_edgelist(newGroup, source='Provider',
target='AttendingPhysician')
fraud1 = fraud
fraud1 = fraud1.reset_index()
fraud1 = fraud1[fraud1['Provider'].isin(newGroup.Provider)]
Ben = newGroup[['AttendingPhysician','PotentialFraud']]
Ben.PotentialFraud = 'Att'
Ben = Ben.drop_duplicates()
Ben = Ben.rename(columns={"AttendingPhysician": "Provider"})
fraud2 = pd.concat([fraud1, Ben])  # append is deprecated; concat stacks provider rows and physician rows
fraud1.head()
fraud2 = fraud2.set_index('Provider')
nx.set_node_attributes(FGa, name = 'Fraud', values = pd.Series(fraud2.PotentialFraud).to_dict())
#nx.set_node_attributes(FGa, 'gender', pd.Series(nodes.gender, index=nodes.node).to_dict())
nx.get_node_attributes(FGa, 'Fraud')
from itertools import count
plt.figure(figsize=(20,20))
# get unique groups
groups = set(nx.get_node_attributes(FGa,'Fraud').values())
mapping = dict(zip(sorted(groups),count()))
nodes = FGa.nodes()
colors = [mapping[FGa.nodes[n]['Fraud']] for n in nodes]  # G.nodes[n] replaces the removed G.node[n] accessor
# drawing nodes and edges separately so we can capture collection for colobar
pos = nx.spring_layout(FGa)
ec = nx.draw_networkx_edges(FGa, pos, alpha=0.2)
nc = nx.draw_networkx_nodes(FGa, pos, nodelist=nodes, node_color=colors,
                            node_size=10, cmap=plt.cm.jet)  # with_labels is not a draw_networkx_nodes argument, so it is dropped
#plt.colorbar(nc)
plt.axis('off')
plt.show()
len(FGa)
print(nx.info(FGa))
nx.get_node_attributes(FGa,'Fraud' )
#plt.figure(figsize=(20,20))
#nx.draw(FGa,
# with_labels = False,
# font_size = 5,
# node_color=['red', 'blue', 'orange'],
# node_size = 200
# )
#plt.show()
plt.figure(figsize=(20,20))
nx.draw(FGa,
with_labels = False,
font_size = 5,
#node_color=['red', 'blue'],
node_size = 100
)
plt.show()
FG = nx.from_pandas_edgelist(ProviderRel.head(10000), source='Provider',
target='BeneID', edge_attr=True)
len(FG)
print(nx.info(FG))
plt.figure(figsize=(10,8))
nx.draw(FG)
plt.show()
Providers flagged as fraud (PotentialFraud == 'Yes')
FraudYes = ProviderRel[ProviderRel['PotentialFraud']== 'Yes']
FG1 = nx.from_pandas_edgelist(FraudYes, source='Provider',
target='BeneID')
print(nx.info(FG1))
plt.figure(figsize=(10,8))
nx.draw(FG1, with_labels = True, node_size = 5000, font_size = 20)
plt.show()
FraudYes['Provider'].value_counts()
OneProvider = FraudYes[FraudYes['Provider'] == 'PRV54895']
FG2 = nx.from_pandas_edgelist(OneProvider, source='Provider',
target='BeneID')
print(nx.info(FG2))
plt.figure(figsize=(10,8))
nx.draw(FG2)
plt.show()
Based on Provider and Attending Physician
ProviderAtt = claims[['Provider','AttendingPhysician','PotentialFraud' ]]
ProviderAtt.head()
ProviderAtt.shape
FG3 = nx.from_pandas_edgelist(ProviderAtt, source='Provider',
target='AttendingPhysician', edge_attr=True)
print(nx.info(FG3))
#FG3.nodes(data=True)
#plt.figure(figsize=(10,8))
#nx.draw(FG3, node_color=['red', 'blue'])
#plt.show()
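FG3 links providers to attending physicians, so a provider's degree in this graph is its number of distinct attending physicians. A rough follow-up sketch (assuming provider IDs start with 'PRV', as seen above, and using the fraud table loaded earlier) compares this count across fraud labels:
deg = dict(FG3.degree())                                                   # node -> number of neighbours
prov_degree = {n: d for n, d in deg.items() if str(n).startswith('PRV')}   # keep provider nodes only
deg_df = pd.Series(prov_degree, name='NumAttendingPhysicians').to_frame()
deg_df = deg_df.merge(fraud, left_index=True, right_index=True, how='left')
deg_df.groupby('PotentialFraud').NumAttendingPhysicians.describe()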